# Replication files for 
# Masterson and Lehmann
# "Refugees, Mobilization, and Humanitarian Aid: Evidence from the Syrian Refugee Crisis in Lebanon"

# File: 0 Dataset construction

####
# ID guide
#	surveyID is the number that I assigned to each household. Only the primary respondents have a surveyID field. 
# The non-primary respondents have NA for surveyID, there are 1361 unique IDs, ranging from 1 to ~1860. 
# This variable is called Q3 in the raw data; it is renamed to surveyID by the setnames() call in the "Merge Datasets" section.
#
# QUEST_ID: ranging from 1-1361. Every member of the HH has a QUEST_ID value, the same for all HH members

# Start from an empty workspace so that objects left over from an earlier
# session cannot leak into the constructed dataset.
remove(list = ls())

# All file paths below are relative to the replication folder.
# NOTE(review): this path is machine-specific; point it at your local copy.
setwd('~/Dropbox/replication/refugees-mobilization-aid-replication-files/')

# Attach dependencies with library() rather than require(): library() stops
# with an error right here when a package is missing, whereas require() only
# returns FALSE and lets the script fail later with a less obvious message.
library("foreign")     # read.dta()
library("data.table")  # keyed merges, := and setnames()
library("FactoMineR")  # NOTE(review): presumably used by the PCA section further down

  
# Standardize a numeric vector: subtract the mean and divide by the standard
# deviation. Missing values are ignored when computing both statistics and
# remain NA in the returned vector.
standardize <- function(variable){
  observed <- variable[!is.na(variable)]
  centre <- mean(observed)
  spread <- sqrt(var(observed))
  (variable - centre) / spread
}

####
# Load Data ---------------------------------------------------------------

# Load the primary survey file. On this first pass 999 is deliberately NOT
# treated as missing, so that genuine ID values of 999 can be located.
data <- read.csv("data/Masterson_Lehmann_IRC_Survey_Data.csv", header = TRUE, fileEncoding="latin1", na.strings=c(""," ",".","NA", "-96", "-97", "-98"))


# 999 was used as an NA value by the survey company during data entry,
# but we cannot include 999 as an NA string in the read.csv command since it
# would conflict with two ID values: QUEST_ID==999 and Q3==999.
# So we save their corresponding ID values and then add them back manually
# after we reload the data below.

# Survey ID(s) (Q3) of the household whose questionnaire ID is 999
nines_mistakeQ3 <- na.omit(data[which(data$QUEST_ID==999), "Q3"])
# Questionnaire ID(s) of the household whose survey ID (Q3) is 999
nines_mistakeQUEST_ID <- data[which(data$Q3==999), "QUEST_ID"]

# Reload the primary data file, this time with 999 as an NA value
data <- read.csv("data/Masterson_Lehmann_IRC_Survey_Data.csv",header = TRUE, fileEncoding="latin1", na.strings=c(""," ",".","NA", "-96", "-97", "-98", "999"))

# Manually restore the 999s that the na.strings step turned into NA.
# %in% is used instead of == so the lookup stays correct if more than one ID
# was saved above (== would silently recycle the shorter vector). NAs are
# stripped from the lookup set because NA %in% NA is TRUE, which would
# otherwise match rows with a missing ID.
# Only the row with Q9c == 1 receives the survey ID.
data[which(data$QUEST_ID %in% na.omit(nines_mistakeQUEST_ID) & data$Q9c==1), "Q3"] <- 999
data[which(data$Q3 %in% na.omit(nines_mistakeQ3)), "QUEST_ID"] <- 999

### Other data files to merge

### 1. Cas_Codes (cadastral IDs for administrative geographic units)
#  This file indicates which cas_codes received treatment
cascodeTreat <- read.dta("data/cadastral_treatment_indicator.dta")

### 2. Sample data provided to the survey firm (II)
sampleDataforII <- read.csv("data/2014_0424_full_sample_without_names.csv")

### 3. UN population data pre-treatment, October 2013
UN_Pop_data <- read.csv("data/registered_syrians_by_village_by_month_since_Oct2013.csv")

### 4. UN pre-treatment covariate background variables
# January 2013 UNHCR data
HHCompData <- read.csv(
  "data/unhcr_survey_data_jan2013.csv",
  na.strings = c(""," ",".","NA", "-96", "-97", "-98")
)
# October 2013 pre-program eligibility for other aid programs
HHAidEligibility <- read.csv(
  "data/wfp_data_other_aid_programs.csv",
  na.strings = c(""," ",".","NA", "-96", "-97", "-98")
)

### 5. Lebanese sect population data from Mourad (2018)
sect <- read.csv("data/mourad_sect_data_trimmed.csv")
# Use the same cadastral ID column name as the other files
names(sect)[names(sect) == "ACS_CODE"] <- "cas_code"

# Aggregate to cas code level.
# The Mourad data is at the municipality level and some CAS codes contain
# several municipalities.
sect[, "Sect_MAJ"] <- as.numeric(as.character(sect[, "Sect_MAJ"]))

# Replace NAs with an indicator for missing
table(sect[, "Sect_MAJ"], useNA='always')
sect[is.na(sect$Sect_MAJ), 'Sect_MAJ'] <- 7

# 0 = Mixed, 1 = Armenian , 2 = Christian, 3 = Druze, 4 = Shia, 5 = Sunni, 6 = Other, 7 = Missing

# Count the municipalities recorded under each cas code. vapply() replaces a
# loop that grew the result vector and iterated over 1:length(), which
# misbehaves on empty input.
unique_cas_codes <- unique(sect$cas_code)
num_mun <- vapply(
  unique_cas_codes,
  function(code) sum(sect$cas_code == code, na.rm = TRUE),
  integer(1)
)
# Cas codes that appear more than once
doubles <- unique_cas_codes[num_mun > 1]

# Remove all rows of cas codes that have more than one municipality.
# A logical mask is used instead of -which(...): with no duplicated cas code,
# which() returns integer(0) and sect[-integer(0), ] would drop EVERY row.
sect <- sect[!(sect$cas_code %in% doubles), ]

# Add back one line for each such cas code, coded as missing (7).
# NOTE(review): assigning colnames(sect) to the two-column insert assumes sect
# has exactly the columns (cas_code, Sect_MAJ) in that order - confirm.
insert <- cbind(doubles, rep(7, length(doubles)))
colnames(insert) <- colnames(sect)
sect <- rbind(sect, insert)



# Flag primary-respondent rows.
# The data hold one row per household member; a row is treated as the primary
# respondent's when its 'Project' field is filled in.
PrimaryRespondent <- as.numeric(!is.na(data[["Project"]]))


#### The questionnaire ID is the unique ID number that the survey company
# assigned to each interview. One interview, however, is missing it.
# Give that household a new ID: one above the largest ID in use.
rows_without_id <- which(is.na(data[["QUEST_ID"]]))
next_free_id <- max(data[["QUEST_ID"]], na.rm = TRUE) + 1
data[rows_without_id, "QUEST_ID"] <- next_free_id


####
# Quality Checks ----------------------------------------------------------

## Confirm that the primary respondent holder calculated correctly
sum(PrimaryRespondent, na.rm=TRUE) == 1361

# Confirm that we have the same number of primary respondents as unique households
all.equal( sum(na.omit(PrimaryRespondent)), length(unique(data[, "QUEST_ID"]))  )

# We do not have all equal
# Identify the problematic household(s): survey IDs (Q3) that occur at least twice
badQ3 <- as.numeric(names(which(table(data[, "Q3"])>=2)))

# Identify the questionnaire ID(s) associated with these Q3 values,
# because all respondents have a QUEST_ID value, but only primary respondents
# have a Q3 value. %in% keeps this correct when several Q3 values are
# duplicated (== would recycle badQ3 against the column).
badQ3_QuestID <- data[which(data[, "Q3"] %in% badQ3), "QUEST_ID"]

# Remove the households that share a Q3 value.
# A logical mask is used instead of -which(...): when nothing matches, which()
# returns integer(0) and data[-integer(0), ] would drop EVERY row.
data <- data[!(data[, "QUEST_ID"] %in% badQ3_QuestID), ]

## Identify any households that do not have a primary respondent marked.
# Rather than checking question 9C, we look for households in which no
# individual has a value for Q3 (survey ID); only primary respondents have one.
# tapply() is used instead of table(...)[, 1], which silently picks the wrong
# column when Q3 is missing on every row.
has_primary <- tapply(!is.na(data$Q3), data$QUEST_ID, any)
noPrimaryRespondent_noQ3 <- as.numeric(names(has_primary)[!has_primary])

# Drop households without a primary respondent (QUEST_ID==1113 in the
# original data). The logical mask keeps all rows when there is none.
data <- data[!(data[, "QUEST_ID"] %in% noPrimaryRespondent_noQ3), ]

# Any households that have more than one primary respondent?
primary_counts <- tapply(!is.na(data$Q3), data$QUEST_ID, sum)
which(primary_counts > 1)

# Rerun: define indicator for primary respondent observation.
# Vectorized on is.na(); blank 'Project' entries were already read in as NA,
# so the previous row loop comparing against " " only produced 1s by relying
# on ifelse() skipping both branches for NA.
PrimaryRespondent <- ifelse(test = is.na(data[, 'Project']), yes = 0, no = 1)

# Confirm that we have the same number of primary respondents as unique households
all.equal( sum(na.omit(PrimaryRespondent)), length(unique(data[, "QUEST_ID"]))  )
# Now this step is correct if these are equal.

## Redefine primary respondent indicator
# Define indicator for primary respondent observation
PrimaryRespondent2 <- ifelse(test = is.na(data[, 'Project']), yes = 0, no =  1)

# Append primary respondent indicator variable to dataset
data <- cbind(data, PrimaryRespondent2)

## Confirm that the indicator is defined (non-NA) on every row
all.equal(dim(data), dim(data[is.na(data$PrimaryRespondent2)==0, ]))


####
# Merge Datasets ----------------------------------------------------------

# Convert every data.frame to a data.table for faster (keyed) merging
data <- data.table(data)
sampleDataforII <- data.table(sampleDataforII)
cascodeTreat <- data.table(cascodeTreat)
UN_Pop_data <- data.table(UN_Pop_data)
HHCompData <- data.table(HHCompData)
HHAidEligibility <- data.table(HHAidEligibility)
sect <- data.table(sect)

## Rename columns so the surveyID number is labeled similarly across datasets.
# From here on the survey's Q3 column is called surveyID.
setnames(data, "Q3", "surveyID")
setnames(sampleDataforII, "X", "surveyID")

## Create survey ID - processing group number data.table
# Keeps the first two columns of the sample file by position.
# NOTE(review): assumes these two columns are surveyID and
# ProcessingGroupNumber - confirm against the CSV layout.
surveyID_ProcessingGroupNumber <- sampleDataforII[,1:2, with = FALSE]

## Define the key for merge
# The survey and sample data are keyed on surveyID; the crosswalk and the two
# UN/WFP files are keyed on ProcessingGroupNumber.
setkey(data, surveyID)
setkey(sampleDataforII, surveyID)
setkey(surveyID_ProcessingGroupNumber, ProcessingGroupNumber)
setkey(HHCompData, ProcessingGroupNumber)
setkey(HHAidEligibility, ProcessingGroupNumber)

## Add surveyID to HHCompData and HHAidEligibility
# merge() is called without all.x/all.y here, so only processing group
# numbers present in both tables are kept.
HHCompData2 <- merge(surveyID_ProcessingGroupNumber, HHCompData)
HHAidEligibility2 <- merge(surveyID_ProcessingGroupNumber, HHAidEligibility)

## Re-key the merged tables on surveyID for the merges with the survey data
setkey(HHCompData2, surveyID)
setkey(HHAidEligibility2, surveyID)

# Drop the processing group number columns now that surveyID is attached
HHCompData2[, ProcessingGroupNumber:=NULL]
HHAidEligibility2[, ProcessingGroupNumber:=NULL]

## The dim of the new merged data set should be
# (all survey rows; columns of both tables minus the shared surveyID key)
c(dim(data)[1], dim(data)[2] + dim(sampleDataforII)[2] -1 )
# Left join: every survey row is kept, sample columns are added by surveyID
dataFull_holder <- merge(data, sampleDataforII, all.x = TRUE)

### Add CAS_CODE info
# Harmonize the cadastral ID column name to cas_code and key on it
setkey(cascodeTreat, cas_code)
setnames(dataFull_holder, "CAS_CODE", "cas_code")
setnames(UN_Pop_data, "CAS_CODE", "cas_code")
setkey(dataFull_holder, cas_code)
setkey(UN_Pop_data, cas_code)

## The dim of the new merged data set should be
c(dim(data)[1], dim(dataFull_holder)[2] + dim(cascodeTreat)[2] -1 )

# Run the merge (left join on cas_code)
dataFull_holder1_2 <- merge(dataFull_holder, cascodeTreat, all.x = TRUE)

# Are the dimensions of the merge correct?
all.equal(dim(dataFull_holder1_2), c(dim(data)[1], dim(dataFull_holder)[2] + dim(cascodeTreat)[2] -1 ))

# Post-merge quality check
## Do the empowerment intro texts match up across merges?
# holder = the first 500 non-missing surveyIDs in the merged data.
# NOTE(review): a numeric i selects data.table rows by position, so this check
# presumes surveyID equals the row number in sampleDataforII - confirm.
holder <- na.omit(dataFull_holder1_2[, surveyID])[1:500]
all.equal(sampleDataforII[holder, empowerment_intro_text], na.omit(dataFull_holder1_2[, empowerment_intro_text])[1:500] )

### Merge the UN household composition and aid eligibility data 
# Both are left joins on surveyID, so no survey rows are dropped here.
setkey(dataFull_holder1_2, surveyID)
dataFull_holder1_3 <- merge(dataFull_holder1_2, HHCompData2, all.x = TRUE)
dataFull_holder2 <- merge(dataFull_holder1_3, HHAidEligibility2, all.x = TRUE)


### Add UN population data 
# Sum the monthly population columns (Oct 2013 - May 2014) within each
# cas_code. na.action=NULL keeps rows that contain NAs in the aggregation and
# na.rm=TRUE makes sum() ignore those NAs.
UN_byCAS <- data.table(aggregate(cbind(Oct2013Pop, Nov2013Pop, Dec2013Pop, Jan2014Pop, Feb2014Pop, Mar2014Pop, Apr2014Pop, May2014Pop)  ~ cas_code, data= UN_Pop_data, sum, na.rm=TRUE, na.action=NULL))
setkey(UN_byCAS, cas_code)
setkey(dataFull_holder2, cas_code)

# Merge the population data with the main data (left join on cas_code)
dataFull_holder3 <- merge(dataFull_holder2, UN_byCAS, all.x = TRUE)

# Post-merge quality check
# Confirm that we didn't lose any observations in the merge
all.equal(dim(dataFull_holder2)[1], dim(dataFull_holder3)[1])

# Redefine name of main working object
dataFull <- dataFull_holder3

## Now set the order back to by QUEST_ID
# setkey() sorts the table by QUEST_ID and makes it the merge key again
setkey(dataFull, QUEST_ID)


####
# Variable Creation -------------------------------------------------------

## Define treatment indicators

## Treatment assignment
dataFull[, Treatment := WINTELG_oct13]

# Self-reported treatment
dataFull[, TreatmentSelfReport := Q27]

# Fighting age and fighting age male indicators
# Variable Q9a is birth year
# Variable Q9b is gender, 1 = female, 2 = male
# For every (youngest, oldest) window an individual is flagged 1 when the
# birth year lies strictly between 2014 - oldest and 2014 - youngest; the
# "Male" variant additionally requires Q9b == 2. Columns are created in the
# order fightingAge<window>, fightingAgeMale<window>, window by window.
age_windows <- list(
  c(15, 55), c(15, 60), c(18, 45), c(18, 50), c(18, 40),
  c(20, 35), c(18, 25), c(35, 50), c(40, 50), c(15, 20)
)
for (window in age_windows) {
  suffix <- paste0(window[1], "_", window[2])
  in_window <- dataFull$Q9a < 2014 - window[1] & dataFull$Q9a > 2014 - window[2]
  dataFull[, (paste0("fightingAge", suffix)) := ifelse(in_window, 1, 0)]
  dataFull[, (paste0("fightingAgeMale", suffix)) := ifelse(in_window & dataFull$Q9b == 2, 1, 0)]
}


#########
# Household-level counts of fighting-age members.
# Each step below sums an individual-level indicator within QUEST_ID, renames
# the summed column to Sum<indicator>, and merges it back onto dataFull by
# QUEST_ID so that every household row carries the household total.
# NOTE(review): aggregate()'s formula interface drops rows with NA by default
# and merge() without all.x keeps only matching QUEST_IDs, so a household whose
# indicator is NA on every row would be dropped from dataFull - confirm this
# is intended.

### Sum the number of Fighting Age Men in each HH 20-35
holder20_35 <- data.table(aggregate( fightingAgeMale20_35 ~ QUEST_ID , data = dataFull , sum ))
setnames(holder20_35, "fightingAgeMale20_35", "SumFightingAgeMale20_35")
setkey(holder20_35, "QUEST_ID")
dataFull <- merge(dataFull, holder20_35)

### Sum the number of Fighting Age Men in each HH 18-45
holder18_45 <- data.table(aggregate( fightingAgeMale18_45 ~ QUEST_ID , data = dataFull , sum ))
setnames(holder18_45, "fightingAgeMale18_45", "SumFightingAgeMale18_45")
setkey(holder18_45, "QUEST_ID")
dataFull <- merge(dataFull, holder18_45)

### Sum the number of Fighting Age Men in each HH 18-50
holder18_50 <- data.table(aggregate( fightingAgeMale18_50 ~ QUEST_ID , data = dataFull , sum ))
setnames(holder18_50, "fightingAgeMale18_50", "SumFightingAgeMale18_50")
setkey(holder18_50, "QUEST_ID")
dataFull <- merge(dataFull, holder18_50)

### Sum the number of Fighting Age Men in each HH 18-40
holder18_40 <- data.table(aggregate( fightingAgeMale18_40 ~ QUEST_ID , data = dataFull , sum ))
setnames(holder18_40, "fightingAgeMale18_40", "SumFightingAgeMale18_40")
setkey(holder18_40, "QUEST_ID")
dataFull <- merge(dataFull, holder18_40)
#	length(unique(dataFull[, QUEST_ID])) #10-31

#########
# The holder objects below are reused (overwritten) for the both-gender counts.

### Sum the number of Fighting Age People (both genders) in each HH 20-35
holder20_35 <- data.table(aggregate( fightingAge20_35 ~ QUEST_ID , data = dataFull , sum ))
setnames(holder20_35, "fightingAge20_35", "SumFightingAge20_35")
setkey(holder20_35, "QUEST_ID")
dataFull <- merge(dataFull, holder20_35)

### Sum the number of Fighting Age People (both genders) in each HH 18-50
holder18_50 <- data.table(aggregate( fightingAge18_50 ~ QUEST_ID , data = dataFull , sum ))
setnames(holder18_50, "fightingAge18_50", "SumFightingAge18_50")
setkey(holder18_50, "QUEST_ID")
dataFull <- merge(dataFull, holder18_50)

### Sum the number of Fighting Age People (both genders) in each HH 18-45
holder18_45 <- data.table(aggregate( fightingAge18_45 ~ QUEST_ID , data = dataFull , sum ))
setnames(holder18_45, "fightingAge18_45", "SumFightingAge18_45")
setkey(holder18_45, "QUEST_ID")
dataFull <- merge(dataFull, holder18_45)

#### Share of men among all fighting-age individuals in the household.
# Households with no fighting-age member in the window get a ratio of 0
# instead of a division by zero.

# 20-35 ratio
ratio_holder20_35 <- dataFull[, ifelse(SumFightingAge20_35 > 0,
                                       SumFightingAgeMale20_35 / SumFightingAge20_35,
                                       0)]
dataFull[, ratio20_35 := ratio_holder20_35]

# 18-45 ratio
ratio_holder18_45 <- dataFull[, ifelse(SumFightingAge18_45 > 0,
                                       SumFightingAgeMale18_45 / SumFightingAge18_45,
                                       0)]
dataFull[, ratio18_45 := ratio_holder18_45]

# 18-50 ratio
ratio_holder18_50 <- dataFull[, ifelse(SumFightingAge18_50 > 0,
                                       SumFightingAgeMale18_50 / SumFightingAge18_50,
                                       0)]
dataFull[, ratio18_50 := ratio_holder18_50]


####
# Extrapolate values from head of household rows to all household rows --------

# Both variables below are only filled in on some rows of a household. Each
# row is given the value found on its household's first non-missing row.
# A vectorized match() lookup replaces the former row-by-row loop: it does the
# same thing when a household has exactly one filled-in row, is far faster,
# and no longer fails when a household has no filled-in row (the value stays
# NA) or more than one (the first is used).

# Extrapolate the altitude variable for all observations
not <- dataFull[which(is.na(dataFull[, WINTELEV_oct13])==0), ]  # rows that carry a value
donor_row <- match(dataFull$QUEST_ID, not$QUEST_ID)             # household's row in `not`
set(dataFull, j = "WINTELEV_oct13", value = not$WINTELEV_oct13[donor_row])

### Extrapolate Treatment Variable
not <- dataFull[which(is.na(dataFull[, Treatment])==0), ]
donor_row <- match(dataFull$QUEST_ID, not$QUEST_ID)
set(dataFull, j = "Treatment", value = not$Treatment[donor_row])


# Variable correction and creation ----------------------------------------

# Correct typo in variable name
setnames(dataFull, "Q11_Mounth", "Q11_Month")

# Flag individuals from the HH schedule who moved to Lebanon recently:
# arrival year 2014, or arrival year 2013 with arrival month >= 10.
# NOTE(review): the variable name says "since November" but the cutoff used
# is month >= 10 - confirm which is intended.
dataFull[, MovedSinceNov := ifelse((Q11_Year == 2013 & Q11_Month >= 10) | (Q11_Year == 2014), 1, 0)]

# Rescaled altitude variable (forcing variable), centred at 500
dataFull[, WINTELEV_oct13_rescaled := WINTELEV_oct13 - 500]

# Fill in missing values based on skip patterns
# Q64, how many went back to Syria
# Q62, are all members of your immediate family living in this village?
# Where Q62 == 1 the follow-up questions are set to 0.
index <- dataFull$Q62 == 1
dataFull[which(index), Q64 := 0]

# Do you have family members living in a part of Syria that is under siege
# or where there is currently fighting? Remaining NAs are also coded 0.
dataFull[which(index), Q66b1 := 0]
index2 <- is.na(dataFull$Q66b1)
dataFull[which(index2), Q66b1 := 0]

### Clean cash variables. Some entries seem to be in Lebanese Lira, not
### dollars: amounts above 20000 are divided by 1500.
dataFull[, Q51_cleaned := ifelse(Q51 > 20000, Q51 / 1500, Q51)]
dataFull[, Q51_1_cleaned := ifelse(Q51_1 > 20000, Q51_1 / 1500, Q51_1)]

### Standardized versions of the key variables.
# Each listed column gets a companion column named <column>_S holding
# standardize(<column>), created in the order listed.
vars_to_standardize <- c(
  "fightingAge20_35", "fightingAgeMale18_45",
  "Q51", "Q51_cleaned", "Q51_1", "Q51_1_cleaned",
  "Q89", "Q90", "Q91_1", "Q64", "Q66b1"
)
for (source_var in vars_to_standardize) {
  standardized_values <- standardize(dataFull[[source_var]])
  dataFull[, (paste0(source_var, "_S")) := standardized_values]
}

## Birth year of the oldest person in the household
# IDs is kept in case later code still uses it.
IDs <- sort(unique(dataFull[, QUEST_ID]))

# Grouped assignment replaces a loop that rescanned the whole table once per
# household. min() is taken without na.rm, so a household with any missing
# birth year (Q9a) gets NA.
dataFull[!is.na(QUEST_ID), oldestPersonBorn := min(Q9a), by = QUEST_ID]

# Birth year of the youngest person in the household born before 2013 (when
# the program started). Missing birth years are ignored here. A household with
# nobody born before 2013 gets NA rather than the -Inf (plus warning) that
# max() returns on an empty vector; indexing the empty vector with NA keeps
# the column type consistent across groups.
dataFull[!is.na(QUEST_ID), youngestPersonBorn := {
  born_before_2013 <- Q9a[which(Q9a < 2013)]
  if (length(born_before_2013) > 0) {
    max(born_before_2013)
  } else {
    born_before_2013[NA_integer_]
  }
}, by = QUEST_ID]

# Returnee variable creation
# The returnee roster has up to ten slots: Q64a_1_<k> is compared with 2
# (male) and Q64a_2_<k> with the age bounds below.
# maleReturnee      = 1 if any slot records a male returnee
# maleReturnee18_45 = 1 if any slot records a male returnee aged between
#                     minAge and maxAge (inclusive)
# With missing slots, an OR over the slots is NA unless some slot is TRUE.
# NOTE(review): the column is named 18_45 but maxAge is 50 - confirm.

minAge <- 18
maxAge <- 50

returnee_slots <- 1:10
gender_cols <- paste0("Q64a_1_", returnee_slots)
age_cols <- paste0("Q64a_2_", returnee_slots)

# Any male among the returnees?
slot_is_male <- lapply(gender_cols, function(col) dataFull[[col]] == 2)
dataFull[, "maleReturnee"] <- ifelse(Reduce(`|`, slot_is_male), 1, 0)

# Pooled, non-missing genders and ages over all roster slots
gendersHolder <- na.omit(do.call(c, lapply(gender_cols, function(col) dataFull[[col]])))
agesHolder <- na.omit(do.call(c, lapply(age_cols, function(col) dataFull[[col]])))

# Any male within the age range among the returnees?
slot_is_male_in_range <- Map(
  function(gender_col, age_col) {
    dataFull[[gender_col]] == 2 &
      dataFull[[age_col]] >= minAge &
      dataFull[[age_col]] <= maxAge
  },
  gender_cols, age_cols
)
dataFull[, "maleReturnee18_45"] <- ifelse(Reduce(`|`, slot_is_male_in_range), 1, 0)


## Number of people in the household
# Count the rows per QUEST_ID and merge the count back as PostTreatFamSize.
numPerHH <- data.table(table(dataFull[, QUEST_ID]))
setnames(numPerHH, old = c("V1", "N"), new = c("QUEST_ID", "PostTreatFamSize"))
# table() stores the IDs as text; convert back to numbers before merging
numPerHH[, QUEST_ID := as.numeric(QUEST_ID)]
setkey(numPerHH, "QUEST_ID")
dataFull2 <- merge(dataFull, numPerHH)


# Subset to rows with primary respondents ('Project' is filled in)
dataFull3 <- dataFull2[!is.na(dataFull2$Project), ]


### Binary "Did anyone return to Syria?" variable
dataFull3[, Q64_0_1 := ifelse(Q64 > 0, 1, 0)]



# Create logged income variables
# For each income column a <column>_log column is added: log(income) where
# income is positive, 0 where income is zero or negative, and NA where income
# is missing. Positive entries are located with which() because assigning
# through a logical subscript that contains NA stops with "NAs are not allowed
# in subscripted assignments"; results are unchanged when no income is missing.
for (income_var in c("Q51", "Q51_1", "Q51_cleaned", "Q51_1_cleaned")) {
  income <- dataFull3[[income_var]]
  logged_income <- rep(0, length(income))
  positive_rows <- which(income > 0)
  logged_income[positive_rows] <- log(income[positive_rows])
  logged_income[is.na(income)] <- NA
  dataFull3[, (paste0(income_var, "_log")) := logged_income]
}

## Clean the reinclusion variable. Currently it is only NAs and 1s;
## recode to 0 (was NA) / 1 (was non-missing).
dataFull3$REINC <- ifelse(is.na(dataFull3$REINC), 0, 1)



####
# Cas code quality check --------------------------------------------------

# Are any households missing cas_code info?
# A household is listed when none of its rows has a cas_code value.
# tapply() is used instead of table(...)[, 1], which assumes the first column
# of the cross-tab is the "not missing" one and breaks when cas_code is
# missing on every row.
has_cas_code <- tapply(!is.na(dataFull3$cas_code), dataFull3$QUEST_ID, any)
noCasCode <- as.numeric(names(has_cas_code)[!has_cas_code])

# Same check on the data as they were right after the cas_code merge
had_cas_code_at_load <- tapply(!is.na(dataFull_holder1_2$cas_code), dataFull_holder1_2$QUEST_ID, any)
noCasCodeAtLoad <- as.numeric(names(had_cas_code_at_load)[!had_cas_code_at_load])

# Drop the household(s) without a cas_code entry.
# A logical mask is used instead of -which(...): if no household were missing
# a cas_code, which() would return integer(0) and the negative index would
# drop EVERY row.
dataFull4_1 <- dataFull3[!(QUEST_ID %in% noCasCode), ]
# Kept for parity with the original script, which built a second copy with ==;
# %in% gives the same rows for a single ID and stays correct for several.
dataFull4_1_TEST <- dataFull3[!(QUEST_ID %in% noCasCode), ]

####
# Merge sect data ---------------------------------------------------------

# Key both tables on the cadastral ID, then left-join the sect coding so that
# every household row is kept.
setkey(dataFull4_1, cas_code)
setkey(sect, cas_code)

dataFull4 <- merge(
  dataFull4_1,
  sect,
  all.x = TRUE
)

# Restore the survey ID as key
setkey(dataFull4, surveyID)


####
# Variable Creation -------------------------------------------------------

# Disaggregate Q12, Q55, Q11_Month and sect into 0/1 indicators for balance tests.
# Q12 - Education completed
# Q55 - Region of origin in Syria
# Each indicator is NA where the source variable is NA.

# Education: Q12_0 ... Q12_6
for (lvl in 0:6) {
  lvl_dummy <- as.numeric(dataFull4$Q12 == lvl)
  dataFull4[, (paste0("Q12_", lvl)) := lvl_dummy]
}

# Region of origin: Q55_1 ... Q55_14
for (lvl in 1:14) {
  lvl_dummy <- as.numeric(dataFull4$Q55 == lvl)
  dataFull4[, (paste0("Q55_", lvl)) := lvl_dummy]
}

#### Month indicators: Q11_Month_1 ... Q11_Month_12
# NOTE(review): the original comment called this a "birthday month" variable,
# but elsewhere Q11_Month is used as the month of arrival in Lebanon - confirm.
for (lvl in 1:12) {
  lvl_dummy <- as.numeric(dataFull4$Q11_Month == lvl)
  dataFull4[, (paste0("Q11_Month_", lvl)) := lvl_dummy]
}

# Disaggregate sect
# 0 = Mixed, 1 = Armenian , 2 = Christian, 3 = Druze, 4 = Shia, 5 = Sunni, 6 = Other, 7 = Missing
# Armenian (1) and Christian (2) are pooled into `christian`.
dataFull4[, mixed := as.numeric(Sect_MAJ == 0)]
dataFull4[, christian := as.numeric(Sect_MAJ == 1 | Sect_MAJ == 2)]
dataFull4[, druze := as.numeric(Sect_MAJ == 3)]
dataFull4[, shia := as.numeric(Sect_MAJ == 4)]
dataFull4[, sunni := as.numeric(Sect_MAJ == 5)]
dataFull4[, other_sect := as.numeric(Sect_MAJ == 6)]
dataFull4[, missing_sect := as.numeric(Sect_MAJ == 7)]


####
# Migration and moving variables ------------------------------------------

# Compare the two altitude / location measurements to see whether people who
# moved also switched treatment arms (altitude threshold: 500).
### Rename the ".x" columns left by the merges
setnames(
  dataFull4,
  old = c("WINTELEV.x", "L4DES.x"),
  new = c("WINTELEV_mar13", "L4DES_mar13")
)

## Mover: the location label differs between the two measurements
dataFull4[, mover := ifelse(as.character(L4DES_mar13) == as.character(L4DES_oct13), yes = 0, no = 1)]
## AltMover: the recorded altitude differs between the two measurements
dataFull4[, AltMover := ifelse(WINTELEV_oct13 == WINTELEV_mar13, yes = 0, no = 1)]

## Movers by treatment group (at or above 500 = treatment, below 500 = control)
##### treatment to control
dataFull4[, moverTtoC := ifelse(WINTELEV_oct13 >= 500 & WINTELEV_mar13 < 500, yes = 1, no = 0)]
##### control to treatment
dataFull4[, moverCtoT := ifelse(WINTELEV_oct13 < 500 & WINTELEV_mar13 >= 500, yes = 1, no = 0)]
##### treatment to treatment (movers only)
dataFull4[, moverTtoT := ifelse(mover == 1 & WINTELEV_oct13 >= 500 & WINTELEV_mar13 >= 500, yes = 1, no = 0)]
##### control to control (movers only)
dataFull4[, moverCtoC := ifelse(mover == 1 & WINTELEV_oct13 < 500 & WINTELEV_mar13 < 500, yes = 1, no = 0)]

### Aggregate some age category variables
# Each SS0xx count comes in an "ND" and a "D" version (not disabled /
# disabled, per the comments in this section); both are added together.
dataFull4[, age0to4 := SS011ND + SS012ND + SS011D + SS012D]
dataFull4[, age5to12 := SS013ND + SS013D]
dataFull4[, age13to17 := SS014ND + SS015ND + SS014D + SS015D]
dataFull4[, age18to59 := SS018ND + SS019ND + SS020ND + SS021ND + SS018D + SS019D + SS020D + SS021D]
dataFull4[, age60plus := SS016ND + SS017ND + SS016D + SS017D]
dataFull4[, age0to17 := age0to4 + age5to12 + age13to17]

#### Sum the disabled and not disabled categories
dataFull4[, baseline_ages0_2 := SS011ND + SS011D]
dataFull4[, baseline_ages3_4 := SS012ND + SS012D]
dataFull4[, baseline_ages5_12 := SS013ND + SS013D]
dataFull4[, baseline_ages13_15 := SS014ND + SS014D]
dataFull4[, baseline_ages16_17 := SS015ND + SS015D]
dataFull4[, baseline_ages60_70 := SS016ND + SS016D]
dataFull4[, baseline_ages_above70 := SS017ND + SS017D]
dataFull4[, baseline_male18_50 := SS018ND + SS018D]
dataFull4[, baseline_female18_50 := SS019ND + SS019D]
dataFull4[, baseline_male51_59 := SS020ND + SS020D]
dataFull4[, baseline_female51_59 := SS021ND + SS021D]

#### Endline-minus-baseline family size
# SSFSC - baseline family size
dataFull4[, rescaled_family_size := PostTreatFamSize - SSFSC]

#### Endline-minus-baseline number of men between 18 and 50
dataFull4[, rescaled_men_18_50 := SumFightingAgeMale18_50 - baseline_male18_50]

# Total number of disabled people in the HH
dataFull4[, disabled := (SS011D + SS012D + SS013D + SS014D + SS015D + SS016D + SS017D + SS018D + SS019D + SS020D + SS021D)]

## Geographic control variables
## Logical indicators for Q55 values 4, 6, 10 and 11, plus origin_gov_0 for
## any "other" geographic origin (none of 4, 6, 10, 11).
dataFull4[, origin_gov_0 := (Q55 != 4 & Q55 != 6 & Q55 != 10 & Q55 != 11)]
dataFull4[, origin_gov_4 := Q55 == 4]
dataFull4[, origin_gov_6 := Q55 == 6]
dataFull4[, origin_gov_10 := Q55 == 10]
dataFull4[, origin_gov_11 := Q55 == 11]

### Q12: merged education variables
# In each case a missing Q12 yields NA first, which is then imputed as 0.

# Q12 in {0, 1}: no education or primary incomplete
dataFull4[, educ_le_priinc := as.integer(Q12 == 0 | Q12 == 1)]
dataFull4[is.na(educ_le_priinc), educ_le_priinc := 0]

# Q12 == 2
# NOTE(review): the original comment labels level 2 "primary incomplete",
# while the variable name suggests primary complete - confirm.
dataFull4[, educ_pricom := as.integer(Q12 == 2)]
dataFull4[is.na(educ_pricom), educ_pricom := 0]

# Q12 == 3: education of head is middle school
dataFull4[, educ_middle := as.integer(Q12 == 3)]
dataFull4[is.na(educ_middle), educ_middle := 0]

# Q12 in {4, 5, 6}: education of head is technical/university
dataFull4[, educ_sec_tech_uni := as.integer(Q12 == 4 | Q12 == 5 | Q12 == 6)]
dataFull4[is.na(educ_sec_tech_uni), educ_sec_tech_uni := 0]


# age_head --- age of head of household, from birth year Q9a
dataFull4[, age_head := (2014 - Q9a)]

#	monthinleb --- months in lebanon
#		Q11_Month
#		Q11_Year
# Impute missing arrival years and months with the mean of the observed values.
# The raw Q11_Year / Q11_Month columns are left untouched; the imputed copies
# are stored as numeric so that a (generally non-integer) mean can be written
# into them.
dataFull4[, Q11_Year_Imputed := as.numeric(Q11_Year)]
dataFull4[, Q11_Month_Imputed := as.numeric(Q11_Month)]

# The means are taken over the full columns *before* subsetting to the missing
# rows; computing them inside the subset would average only NAs.
year_mean <- mean(dataFull4$Q11_Year_Imputed, na.rm = TRUE)
month_mean <- mean(dataFull4$Q11_Month_Imputed, na.rm = TRUE)
dataFull4[is.na(Q11_Year_Imputed), Q11_Year_Imputed := year_mean]
dataFull4[is.na(Q11_Month_Imputed), Q11_Month_Imputed := month_mean]

# Months elapsed between arrival and month 4 of 2014 (the reference point used
# by the original year-by-year formulas):
#   2014 -> 4 - month,  2013 -> 16 - month,  2012 -> 28 - month,  2011 -> 40 - month
# which is (2014 - year) * 12 + (4 - month).
#
# The closed form is used instead of testing `year == 2011`, ..., `year == 2014`
# because a mean-imputed year is generally not an integer: under the equality
# tests every household with an imputed year matched no branch and received the
# 999 sentinel, which defeated the imputation. With the closed form such a
# household receives the sample-mean value of monthinleb.
#
# Years outside 2011-2014 keep the original 999 sentinel.
# NOTE(review): 999 is stored as an ordinary number; confirm that downstream
# analyses treat it as a missing-value code.
dataFull4[, monthinleb := ifelse(
  Q11_Year_Imputed >= 2011 & Q11_Year_Imputed <= 2014,
  yes = (2014 - Q11_Year_Imputed) * 12 + (4 - Q11_Month_Imputed),
  no = 999
)]


#	base_relatives --- relatives in lebanon
# base_relatives is a copy of Q58, base_friends (friends in lebanon) a copy of Q60
dataFull4[, `:=`(base_relatives = Q58, base_friends = Q60)]

# Only base_relatives has its missing answers recoded to 0;
# base_friends keeps its NAs.
dataFull4[is.na(base_relatives), base_relatives := 0]


# Replace NA with 0 in each of the seven education dummies Q12_0 ... Q12_6
for (educ_dummy in paste0("Q12_", 0:6)) {
  na_rows <- which(is.na(dataFull4[[educ_dummy]]))
  set(dataFull4, i = na_rows, j = educ_dummy, value = 0)
}


####
# PCA Analysis ------------------------------------------------------------

# Add PCA scores and a standardized-mean index to the data
# Eight index components, each label paired with the column (or product of
# columns) it is built from. The label "neg_change_men" is kept as is, although
# the component is rescaled_men_18_50 (the commented-out alternative in the
# original was -SumFightingAgeMale18_50).
index_inputs <- list(
  "neg_change_men"    = dataFull4$rescaled_men_18_50,
  "returnee"          = dataFull4$Q64_0_1,
  "change_men_return" = I(dataFull4$rescaled_men_18_50 * dataFull4$Q64_0_1),
  "siege"             = dataFull4$Q66b1,
  "return_siege"      = I(dataFull4$Q66b1 * dataFull4$Q64_0_1),
  "dangerous work"    = dataFull4$Q89,
  "Syr_for_money"     = dataFull4$Q91_1,
  "money_from_Syr"    = dataFull4$Q51_1
)
index_variables <- as.data.frame(do.call(cbind, unname(index_inputs)))
colnames(index_variables) <- names(index_inputs)

# Standardize every component with standardize() (defined at the top of the file)
index_variables_ST <- apply(X = index_variables, MARGIN = 2, FUN = standardize)

# Index = per-household mean of the standardized variables
# (NA whenever any component is NA, because mean() is called without na.rm)
index_row_mean <- apply(X = index_variables_ST, MARGIN = 1, FUN = mean)

# Run Principal Component Analysis on the unstandardized components
# NOTE(review): PCA() is called with all defaults; per the FactoMineR
# documentation these scale the variables, draw graphs and keep ncp = 5
# dimensions -- confirm this is intended.
PCAoutput <- PCA(X = index_variables)

# Household scores on the first two dimensions, and the mean score across the
# remaining dimensions that PCA() returned.
# NOTE(review): if only 5 dimensions are kept, PCA_other averages Dim.3-Dim.5,
# whereas pca_eig_others.csv below sums the variance share of every component
# after the second -- confirm the two are meant to differ.
ind_scores <- PCAoutput$ind$coord
pca_dim1 <- ind_scores[, "Dim.1"]
pca_dim2 <- ind_scores[, "Dim.2"]
pca_rest_mean <- apply(X = ind_scores[, -(1:2)], MARGIN = 1, FUN = mean)

# Variable coordinates with every entry of absolute value <= 0.25 set to 0
coords25 <- ifelse(abs(PCAoutput$var$coord) > 0.25, PCAoutput$var$coord, 0)

# Append the new columns to the dataset
dataFull4[, `:=`(
  index_st_mean = index_row_mean,
  PCA1          = pca_dim1,
  PCA2          = pca_dim2,
  PCA_other     = pca_rest_mean
)]

# Export the variance shares (first, second, all others combined) and the loadings
pct_variance <- PCAoutput$eig[, "percentage of variance"]
write.csv(pct_variance[1], row.names = FALSE, file = "output/pca_eig1.csv")
write.csv(pct_variance[2], row.names = FALSE, file = "output/pca_eig2.csv")
write.csv(sum(pct_variance[-(1:2)]), row.names = FALSE, file = "output/pca_eig_others.csv")
write.csv(PCAoutput$var$coord, file = "output/pca-variable-loadings.csv")
write.csv(coords25, file = "output/pca-variable-loadings-trimmed.csv")


####
# Define Data Frames for Analysis -----------------------------------------

# Define short outcomes data ----------------------------------------------

#### Define SHORT working variables
# Every output column name is paired with the vector that fills it. The vectors
# are bound with cbind() (so they are coerced to one common type, as a matrix)
# before being turned into a data frame, and the names are applied afterwards
# so that non-syntactic names such as "Q64>0" survive unchanged.
short_columns <- list(
  # outcomes
  "DiD18_50"                        = dataFull4$rescaled_men_18_50,
  "Q64>0"                           = as.numeric(dataFull4$Q64 > 0),
  "I(rescaled_men_18_50 * Q64_0_1)" = I(dataFull4$rescaled_men_18_50 * dataFull4$Q64_0_1),
  "66b1"                            = dataFull4$Q66b1,
  "returnedXundersiege"             = I((dataFull4$Q64 > 0) * dataFull4$Q66b1),
  # household composition brackets
  "SS011D"  = dataFull4$SS011D,
  "SS012D"  = dataFull4$SS012D,
  "SS013D"  = dataFull4$SS013D,
  "SS014D"  = dataFull4$SS014D,
  "SS015D"  = dataFull4$SS015D,
  "SS016D"  = dataFull4$SS016D,
  "SS017D"  = dataFull4$SS017D,
  "SS018D"  = dataFull4$SS018D,
  "SS019D"  = dataFull4$SS019D,
  "SS020D"  = dataFull4$SS020D,
  "SS021D"  = dataFull4$SS021D,
  "SS011ND" = dataFull4$SS011ND,
  "SS012ND" = dataFull4$SS012ND,
  "SS013ND" = dataFull4$SS013ND,
  "SS014ND" = dataFull4$SS014ND,
  "SS015ND" = dataFull4$SS015ND,
  "SS016ND" = dataFull4$SS016ND,
  "SS017ND" = dataFull4$SS017ND,
  "SS018ND" = dataFull4$SS018ND,
  "SS019ND" = dataFull4$SS019ND,
  "SS020ND" = dataFull4$SS020ND,
  "SS021ND" = dataFull4$SS021ND,
  "SSFSC"   = dataFull4$SSFSC,
  # education of the household head (dummies)
  "Q12_0" = dataFull4$Q12_0,  # never studied
  "Q12_1" = dataFull4$Q12_1,  # incomplete primary school
  "Q12_2" = dataFull4$Q12_2,  # finished primary school
  "Q12_3" = dataFull4$Q12_3,  # finished middle school
  "Q12_4" = dataFull4$Q12_4,  # finished secondary school
  "Q12_5" = dataFull4$Q12_5,  # technical school
  "Q12_6" = dataFull4$Q12_6,  # university
  # governorate of origin (dummies)
  "Q55_1"  = dataFull4$Q55_1,   # Damascus
  "Q55_2"  = dataFull4$Q55_2,   # Reef Damascus
  "Q55_3"  = dataFull4$Q55_3,   # Qonaitara
  "Q55_4"  = dataFull4$Q55_4,   # Dar`a
  "Q55_5"  = dataFull4$Q55_5,   # Suweida
  "Q55_6"  = dataFull4$Q55_6,   # Homs
  "Q55_7"  = dataFull4$Q55_7,   # Tartous
  "Q55_8"  = dataFull4$Q55_8,   # Laziqiyya
  "Q55_9"  = dataFull4$Q55_9,   # Hama
  "Q55_10" = dataFull4$Q55_10,  # Idleb
  "Q55_11" = dataFull4$Q55_11,  # Aleppo
  "Q55_12" = dataFull4$Q55_12,  # Raqa
  "Q55_13" = dataFull4$Q55_13,  # Deir el Zoor
  "Q55_14" = dataFull4$Q55_14,  # Hasaki
  # aggregated age groups, town population, constructed controls
  "age0to4"           = dataFull4$age0to4,
  "age5to12"          = dataFull4$age5to12,
  "age13to17"         = dataFull4$age13to17,
  "age18to59"         = dataFull4$age18to59,
  "age60plus"         = dataFull4$age60plus,
  "Oct2013Pop"        = dataFull4$Oct2013Pop,
  "disabled"          = dataFull4$disabled,
  "origin_gov_0"      = dataFull4$origin_gov_0,
  "origin_gov_4"      = dataFull4$origin_gov_4,
  "origin_gov_6"      = dataFull4$origin_gov_6,
  "origin_gov_10"     = dataFull4$origin_gov_10,
  "origin_gov_11"     = dataFull4$origin_gov_11,
  "educ_le_priinc"    = dataFull4$educ_le_priinc,
  "educ_pricom"       = dataFull4$educ_pricom,
  "educ_middle"       = dataFull4$educ_middle,
  "educ_sec_tech_uni" = dataFull4$educ_sec_tech_uni,
  "age_head"          = dataFull4$age_head,
  "monthinleb"        = dataFull4$monthinleb,
  "base_relatives"    = dataFull4$base_relatives,
  "base_friends"      = dataFull4$base_friends,
  # running variable, treatment indicator and identifiers
  "WINTELEV_oct13_rescaled" = dataFull4$WINTELEV_oct13_rescaled,
  "Treatment"               = dataFull4$Treatment,
  "cas_code"                = dataFull4$cas_code,
  "surveyID"                = dataFull4$surveyID
)

outcomesData_short <- data.frame(do.call(cbind, unname(short_columns)))
colnames(outcomesData_short) <- names(short_columns)

##### Second short data frame, for use in the Appendix
####  Define short2 working variables
# Same construction as the first short data frame, but with a different set of
# five leading outcome columns. Names are paired with their source vectors,
# bound with cbind() (one common type), and applied after the data frame is
# built so that non-syntactic names such as "Q51_1>0" are kept verbatim.
short2_columns <- list(
  # outcomes
  "SumFightingAgeMale18_50"     = dataFull4$SumFightingAgeMale18_50,
  "Q89"                         = dataFull4$Q89,
  "Q91_1"                       = dataFull4$Q91_1,
  "Q51_1>0"                     = as.numeric(dataFull4$Q51_1 > 0),
  "I(rescaled_men_18_50xQ66b1)" = I(dataFull4$rescaled_men_18_50 * dataFull4$Q66b1),
  # household composition brackets
  "SS011D"  = dataFull4$SS011D,
  "SS012D"  = dataFull4$SS012D,
  "SS013D"  = dataFull4$SS013D,
  "SS014D"  = dataFull4$SS014D,
  "SS015D"  = dataFull4$SS015D,
  "SS016D"  = dataFull4$SS016D,
  "SS017D"  = dataFull4$SS017D,
  "SS018D"  = dataFull4$SS018D,
  "SS019D"  = dataFull4$SS019D,
  "SS020D"  = dataFull4$SS020D,
  "SS021D"  = dataFull4$SS021D,
  "SS011ND" = dataFull4$SS011ND,
  "SS012ND" = dataFull4$SS012ND,
  "SS013ND" = dataFull4$SS013ND,
  "SS014ND" = dataFull4$SS014ND,
  "SS015ND" = dataFull4$SS015ND,
  "SS016ND" = dataFull4$SS016ND,
  "SS017ND" = dataFull4$SS017ND,
  "SS018ND" = dataFull4$SS018ND,
  "SS019ND" = dataFull4$SS019ND,
  "SS020ND" = dataFull4$SS020ND,
  "SS021ND" = dataFull4$SS021ND,
  "SSFSC"   = dataFull4$SSFSC,
  # education of the household head (dummies)
  "Q12_0" = dataFull4$Q12_0,  # never studied
  "Q12_1" = dataFull4$Q12_1,  # incomplete primary school
  "Q12_2" = dataFull4$Q12_2,  # finished primary school
  "Q12_3" = dataFull4$Q12_3,  # finished middle school
  "Q12_4" = dataFull4$Q12_4,  # finished secondary school
  "Q12_5" = dataFull4$Q12_5,  # technical school
  "Q12_6" = dataFull4$Q12_6,  # university
  # governorate of origin (dummies)
  "Q55_1"  = dataFull4$Q55_1,   # Damascus
  "Q55_2"  = dataFull4$Q55_2,   # Reef Damascus
  "Q55_3"  = dataFull4$Q55_3,   # Qonaitara
  "Q55_4"  = dataFull4$Q55_4,   # Dar`a
  "Q55_5"  = dataFull4$Q55_5,   # Suweida
  "Q55_6"  = dataFull4$Q55_6,   # Homs
  "Q55_7"  = dataFull4$Q55_7,   # Tartous
  "Q55_8"  = dataFull4$Q55_8,   # Laziqiyya
  "Q55_9"  = dataFull4$Q55_9,   # Hama
  "Q55_10" = dataFull4$Q55_10,  # Idleb
  "Q55_11" = dataFull4$Q55_11,  # Aleppo
  "Q55_12" = dataFull4$Q55_12,  # Raqa
  "Q55_13" = dataFull4$Q55_13,  # Deir el Zoor
  "Q55_14" = dataFull4$Q55_14,  # Hasaki
  # aggregated age groups, town population, constructed controls
  "age0to4"           = dataFull4$age0to4,
  "age5to12"          = dataFull4$age5to12,
  "age13to17"         = dataFull4$age13to17,
  "age18to59"         = dataFull4$age18to59,
  "age60plus"         = dataFull4$age60plus,
  "Oct2013Pop"        = dataFull4$Oct2013Pop,
  "disabled"          = dataFull4$disabled,
  "origin_gov_0"      = dataFull4$origin_gov_0,
  "origin_gov_4"      = dataFull4$origin_gov_4,
  "origin_gov_6"      = dataFull4$origin_gov_6,
  "origin_gov_10"     = dataFull4$origin_gov_10,
  "origin_gov_11"     = dataFull4$origin_gov_11,
  "educ_le_priinc"    = dataFull4$educ_le_priinc,
  "educ_pricom"       = dataFull4$educ_pricom,
  "educ_middle"       = dataFull4$educ_middle,
  "educ_sec_tech_uni" = dataFull4$educ_sec_tech_uni,
  "age_head"          = dataFull4$age_head,
  "monthinleb"        = dataFull4$monthinleb,
  "base_relatives"    = dataFull4$base_relatives,
  "base_friends"      = dataFull4$base_friends,
  # running variable, treatment indicator and identifiers
  "WINTELEV_oct13_rescaled" = dataFull4$WINTELEV_oct13_rescaled,
  "Treatment"               = dataFull4$Treatment,
  "cas_code"                = dataFull4$cas_code,
  "surveyID"                = dataFull4$surveyID
)

outcomesData_short2 <- data.frame(do.call(cbind, unname(short2_columns)))
colnames(outcomesData_short2) <- names(short2_columns)


####
# Define Long Variables Dataframe -----------------------------------------

#### 1.n Define LONG working variables
# Thirteen leading outcome columns (including the three PCA summaries) followed
# by the covariate, treatment and identifier columns. Names are paired with
# their source vectors, bound with cbind() (one common type), and applied after
# the data frame is built so that non-syntactic names are kept verbatim.
long_columns <- list(
  # outcomes
  "DiD18_50"                      = dataFull4$rescaled_men_18_50,
  "Q64>0"                         = as.numeric(dataFull4$Q64 > 0),
  "I(rescaled_men_18_50xQ64_0_1)" = I(dataFull4$rescaled_men_18_50 * dataFull4$Q64_0_1),
  "66b1"                          = dataFull4$Q66b1,
  "returnedXundersiege"           = I((dataFull4$Q64 > 0) * dataFull4$Q66b1),
  "SumFightingAgeMale18_50"       = dataFull4$SumFightingAgeMale18_50,
  "Q89"                           = dataFull4$Q89,
  "Q91_1"                         = dataFull4$Q91_1,
  "Q51_1>0"                       = as.numeric(dataFull4$Q51_1 > 0),
  "I(rescaled_men_18_50xQ66b1)"   = I(dataFull4$rescaled_men_18_50 * dataFull4$Q66b1),
  "PCA1"                          = dataFull4$PCA1,
  "PCA2"                          = dataFull4$PCA2,
  "PCA_other"                     = dataFull4$PCA_other,
  # household composition brackets
  "SS011D"  = dataFull4$SS011D,
  "SS012D"  = dataFull4$SS012D,
  "SS013D"  = dataFull4$SS013D,
  "SS014D"  = dataFull4$SS014D,
  "SS015D"  = dataFull4$SS015D,
  "SS016D"  = dataFull4$SS016D,
  "SS017D"  = dataFull4$SS017D,
  "SS018D"  = dataFull4$SS018D,
  "SS019D"  = dataFull4$SS019D,
  "SS020D"  = dataFull4$SS020D,
  "SS021D"  = dataFull4$SS021D,
  "SS011ND" = dataFull4$SS011ND,
  "SS012ND" = dataFull4$SS012ND,
  "SS013ND" = dataFull4$SS013ND,
  "SS014ND" = dataFull4$SS014ND,
  "SS015ND" = dataFull4$SS015ND,
  "SS016ND" = dataFull4$SS016ND,
  "SS017ND" = dataFull4$SS017ND,
  "SS018ND" = dataFull4$SS018ND,
  "SS019ND" = dataFull4$SS019ND,
  "SS020ND" = dataFull4$SS020ND,
  "SS021ND" = dataFull4$SS021ND,
  "SSFSC"   = dataFull4$SSFSC,
  # education of the household head (dummies)
  "Q12_0" = dataFull4$Q12_0,  # never studied
  "Q12_1" = dataFull4$Q12_1,  # incomplete primary school
  "Q12_2" = dataFull4$Q12_2,  # finished primary school
  "Q12_3" = dataFull4$Q12_3,  # finished middle school
  "Q12_4" = dataFull4$Q12_4,  # finished secondary school
  "Q12_5" = dataFull4$Q12_5,  # technical school
  "Q12_6" = dataFull4$Q12_6,  # university
  # governorate of origin (dummies)
  "Q55_1"  = dataFull4$Q55_1,   # Damascus
  "Q55_2"  = dataFull4$Q55_2,   # Reef Damascus
  "Q55_3"  = dataFull4$Q55_3,   # Qonaitara
  "Q55_4"  = dataFull4$Q55_4,   # Dar`a
  "Q55_5"  = dataFull4$Q55_5,   # Suweida
  "Q55_6"  = dataFull4$Q55_6,   # Homs
  "Q55_7"  = dataFull4$Q55_7,   # Tartous
  "Q55_8"  = dataFull4$Q55_8,   # Laziqiyya
  "Q55_9"  = dataFull4$Q55_9,   # Hama
  "Q55_10" = dataFull4$Q55_10,  # Idleb
  "Q55_11" = dataFull4$Q55_11,  # Aleppo
  "Q55_12" = dataFull4$Q55_12,  # Raqa
  "Q55_13" = dataFull4$Q55_13,  # Deir el Zoor
  "Q55_14" = dataFull4$Q55_14,  # Hasaki
  # aggregated age groups, town population, constructed controls
  "age0to4"           = dataFull4$age0to4,
  "age5to12"          = dataFull4$age5to12,
  "age13to17"         = dataFull4$age13to17,
  "age18to59"         = dataFull4$age18to59,
  "age60plus"         = dataFull4$age60plus,
  "Oct2013Pop"        = dataFull4$Oct2013Pop,
  "disabled"          = dataFull4$disabled,
  "origin_gov_0"      = dataFull4$origin_gov_0,
  "origin_gov_4"      = dataFull4$origin_gov_4,
  "origin_gov_6"      = dataFull4$origin_gov_6,
  "origin_gov_10"     = dataFull4$origin_gov_10,
  "origin_gov_11"     = dataFull4$origin_gov_11,
  "educ_le_priinc"    = dataFull4$educ_le_priinc,
  "educ_pricom"       = dataFull4$educ_pricom,
  "educ_middle"       = dataFull4$educ_middle,
  "educ_sec_tech_uni" = dataFull4$educ_sec_tech_uni,
  "age_head"          = dataFull4$age_head,
  "monthinleb"        = dataFull4$monthinleb,
  "base_relatives"    = dataFull4$base_relatives,
  "base_friends"      = dataFull4$base_friends,
  # running variable, treatment indicator and identifiers
  "WINTELEV_oct13_rescaled" = dataFull4$WINTELEV_oct13_rescaled,
  "Treatment"               = dataFull4$Treatment,
  "cas_code"                = dataFull4$cas_code,
  "surveyID"                = dataFull4$surveyID
)

outcomesData_long <- data.frame(do.call(cbind, unname(long_columns)))
colnames(outcomesData_long) <- names(long_columns)





####
# Define Balance Variables ------------------------------------------------

# Balance variables, plus the few columns needed to run the regressions
# (town population, running variable, treatment indicator, cas_code).
# Names are paired with their source vectors, bound with cbind() (one common
# type), and applied after the data frame is built.
#
# NOTE(review): the original annotations described the SS0xxD columns as
# "not disabled" and the SS0xxND columns as "disabled", while the `disabled`
# total is computed from the SS0xxD columns. Only the age/sex brackets are
# repeated below -- confirm the D/ND meaning against the codebook.
balance_columns <- list(
  "SS011D"  = dataFull4$SS011D,   # age 0-2
  "SS012D"  = dataFull4$SS012D,   # age 3-4
  "SS013D"  = dataFull4$SS013D,   # age 5-12
  "SS014D"  = dataFull4$SS014D,   # age 13-15
  "SS015D"  = dataFull4$SS015D,   # age 16-17
  "SS016D"  = dataFull4$SS016D,   # age 60-70
  "SS017D"  = dataFull4$SS017D,   # age 70 and above
  "SS018D"  = dataFull4$SS018D,   # age 18-50, male
  "SS019D"  = dataFull4$SS019D,   # age 18-50, female
  "SS020D"  = dataFull4$SS020D,   # age 51-59, male
  "SS021D"  = dataFull4$SS021D,   # age 51-59, female
  "SS011ND" = dataFull4$SS011ND,  # age 0-2
  "SS012ND" = dataFull4$SS012ND,  # age 3-4
  "SS013ND" = dataFull4$SS013ND,  # age 5-12
  "SS014ND" = dataFull4$SS014ND,  # age 13-15
  "SS015ND" = dataFull4$SS015ND,  # age 16-17
  "SS016ND" = dataFull4$SS016ND,  # age 60-70
  "SS017ND" = dataFull4$SS017ND,  # age 70 and above
  "SS018ND" = dataFull4$SS018ND,  # age 18-50, male
  "SS019ND" = dataFull4$SS019ND,  # age 18-50, female
  "SS020ND" = dataFull4$SS020ND,  # male (age bracket not stated in the original note)
  "SS021ND" = dataFull4$SS021ND,  # female (age bracket not stated in the original note)
  "SSFSC"   = dataFull4$SSFSC,
  "Q9a"     = dataFull4$Q9a,      # primary respondent age (per the original note)
  # when did the household head arrive in Lebanon? (month dummies, then year)
  "Q11_Month_1"  = dataFull4$Q11_Month_1,
  "Q11_Month_2"  = dataFull4$Q11_Month_2,
  "Q11_Month_3"  = dataFull4$Q11_Month_3,
  "Q11_Month_4"  = dataFull4$Q11_Month_4,
  "Q11_Month_5"  = dataFull4$Q11_Month_5,
  "Q11_Month_6"  = dataFull4$Q11_Month_6,
  "Q11_Month_7"  = dataFull4$Q11_Month_7,
  "Q11_Month_8"  = dataFull4$Q11_Month_8,
  "Q11_Month_9"  = dataFull4$Q11_Month_9,
  "Q11_Month_10" = dataFull4$Q11_Month_10,
  "Q11_Month_11" = dataFull4$Q11_Month_11,
  "Q11_Month_12" = dataFull4$Q11_Month_12,
  "Q11_Year"     = dataFull4$Q11_Year,
  # what is the level of education of the household head?
  "Q12_0" = dataFull4$Q12_0,  # never studied
  "Q12_1" = dataFull4$Q12_1,  # incomplete primary school
  "Q12_2" = dataFull4$Q12_2,  # finished primary school
  "Q12_3" = dataFull4$Q12_3,  # finished middle school
  "Q12_4" = dataFull4$Q12_4,  # finished secondary school
  "Q12_5" = dataFull4$Q12_5,  # technical school
  "Q12_6" = dataFull4$Q12_6,  # university
  # which governorate in Syria are you from?
  "Q55_1"  = dataFull4$Q55_1,   # Damascus
  "Q55_2"  = dataFull4$Q55_2,   # Reef Damascus
  "Q55_3"  = dataFull4$Q55_3,   # Qonaitara
  "Q55_4"  = dataFull4$Q55_4,   # Dar`a
  "Q55_5"  = dataFull4$Q55_5,   # Suweida
  "Q55_6"  = dataFull4$Q55_6,   # Homs
  "Q55_7"  = dataFull4$Q55_7,   # Tartous
  "Q55_8"  = dataFull4$Q55_8,   # Laziqiyya
  "Q55_9"  = dataFull4$Q55_9,   # Hama
  "Q55_10" = dataFull4$Q55_10,  # Idleb
  "Q55_11" = dataFull4$Q55_11,  # Aleppo
  "Q55_12" = dataFull4$Q55_12,  # Raqa
  "Q55_13" = dataFull4$Q55_13,  # Deir el Zoor
  "Q55_14" = dataFull4$Q55_14,  # Hasaki
  "Q56" = dataFull4$Q56,  # are you from the city or the country?
  "Q57" = dataFull4$Q57,  # did anyone in the HH have family in Lebanon before you came?
  # sect indicators
  "mixed"        = dataFull4$mixed,
  "christian"    = dataFull4$christian,
  "druze"        = dataFull4$druze,
  "shia"         = dataFull4$shia,
  "sunni"        = dataFull4$sunni,
  "other_sect"   = dataFull4$other_sect,
  "missing_sect" = dataFull4$missing_sect,
  # columns needed for the regressions
  "Oct2013Pop"              = dataFull4$Oct2013Pop,  # pre-treatment Syrian population of town where respondent lived
  "WINTELEV_oct13_rescaled" = dataFull4$WINTELEV_oct13_rescaled,
  "Treatment"               = dataFull4$Treatment,
  "cas_code"                = dataFull4$cas_code
)

balance_variables <- data.frame(do.call(cbind, unname(balance_columns)))
colnames(balance_variables) <- names(balance_columns)



#############
##### NB: All variable construction belongs above this line, and must be
#####     applied to dataFull4 -- NOT to `data`, which is overwritten below.
#############

# Expose the finished table under the name `data`
data <- dataFull4

# Drop every workspace object that is not on the keep list
# (the keep-list vector itself is not on the list, so it is dropped as well)
keep_objects <- c(
  "data",
  "outcomesData_short",
  "outcomesData_short2",
  "balance_variables",
  "outcomesData_long",
  "HHAidEligibility"
)
remove(list = setdiff(ls(), keep_objects))


## Save the remaining workspace as an .RData image
save.image(file = "data/LebanonCashData.RData")

